{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's do the first investigation of CNVs, using the guidelines in the CNVnator paper to look for DNVs in trios.\n",
    "\n",
    "We have 3 different window choices, so let's start with one and we can plot the results later to see how our findings change. But start by parsing the trios, so we know the different identifities."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import glob\n",
    "\n",
    "\n",
    "data_dir = '/data/NCR_SBRB/simplex/'\n",
    "trios = {}\n",
    "affected = []\n",
    "controls = []\n",
    "peds = glob.glob(data_dir + '*trio*ped')\n",
    "for ped in peds:\n",
    "    trio_name = ped.split('/')[-1].split('.')[0]\n",
    "    fid = open(ped, 'r')\n",
    "    fam = {}\n",
    "    for line in fid:\n",
    "        famid, sid, fa, mo, sex, aff = line.rstrip().split('\\t')\n",
    "        if fa != '0':\n",
    "            fam['child'] = sid\n",
    "            if aff == '1':\n",
    "                affected.append(trio_name)\n",
    "            else:\n",
    "                controls.append(trio_name)\n",
    "        elif sex == '1':\n",
    "            fam['father'] = sid\n",
    "        else:\n",
    "            fam['mother'] = sid\n",
    "    trios[trio_name] = fam\n",
    "    fid.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can examine a single family and window:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "w = 500\n",
    "trio_name = '855_trio1'\n",
    "\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "def get_dnvs(trio, data_dir, w):\n",
    "    child = pd.read_table(data_dir + \\\n",
    "                          'cnvnator/%s_calls_w%d.txt' % (trio['child'], w),\n",
    "                         header = False,\n",
    "                         names = ['CNV_type', 'coordinates', 'CNV_size',\n",
    "                                  'normalized_RD', 'e-val1', 'e-val2',\n",
    "                                  'e-val3', 'e-val4', 'q0'])\n",
    "    father = pd.read_table(data_dir + \\\n",
    "                          'cnvnator/%s_calls_w%d.txt' % (trio['father'], w),\n",
    "                         header = False,\n",
    "                         names = ['CNV_type', 'coordinates', 'CNV_size',\n",
    "                                  'normalized_RD', 'e-val1', 'e-val2',\n",
    "                                  'e-val3', 'e-val4', 'q0'])\n",
    "    mother = pd.read_table(data_dir + \\\n",
    "                          'cnvnator/%s_calls_w%d.txt' % (trio['mother'], w),\n",
    "                         header = False,\n",
    "                         names = ['CNV_type', 'coordinates', 'CNV_size',\n",
    "                                  'normalized_RD', 'e-val1', 'e-val2',\n",
    "                                  'e-val3', 'e-val4', 'q0'])\n",
    "    denovo = []\n",
    "    for cnv_type in ['duplication', 'deletion']:\n",
    "        cnvs = child['coordinates'][child['CNV_type'] == cnv_type]\n",
    "        child_pos = pd.DataFrame([[s.split(':')[0],\n",
    "                                   int(s.split(':')[1].split('-')[0]),\n",
    "                                   int(s.split(':')[1].split('-')[1])]\n",
    "                                  for s in cnvs],\n",
    "                                columns=['chr', 'start', 'end'])\n",
    "        cnvs = father['coordinates'][father['CNV_type'] == cnv_type]\n",
    "        father_pos = pd.DataFrame([[s.split(':')[0],\n",
    "                                   int(s.split(':')[1].split('-')[0]),\n",
    "                                   int(s.split(':')[1].split('-')[1])]\n",
    "                                  for s in cnvs],\n",
    "                                columns=['chr', 'start', 'end'])\n",
    "        cnvs = mother['coordinates'][mother['CNV_type'] == cnv_type]\n",
    "        mother_pos = pd.DataFrame([[s.split(':')[0],\n",
    "                                   int(s.split(':')[1].split('-')[0]),\n",
    "                                   int(s.split(':')[1].split('-')[1])]\n",
    "                                  for s in cnvs],\n",
    "                                columns=['chr', 'start', 'end'])\n",
    "\n",
    "        # for each child CNV, make sure there's no father or mother CNV that\n",
    "        # overlap it\n",
    "        for index, row in child_pos.iterrows():\n",
    "            mother_start_overlap = np.sum((mother_pos['start'] < row['start']) &\n",
    "                                          (mother_pos['end'] > row['start']))\n",
    "            mother_end_overlap = np.sum((mother_pos['start'] < row['end']) &\n",
    "                                          (mother_pos['end'] > row['end']))\n",
    "            father_start_overlap = np.sum((father_pos['start'] < row['start']) &\n",
    "                                          (father_pos['end'] > row['start']))\n",
    "            father_end_overlap = np.sum((father_pos['start'] < row['end']) &\n",
    "                                          (father_pos['end'] > row['end']))\n",
    "            if mother_start_overlap + mother_end_overlap + \\\n",
    "               father_start_overlap + father_end_overlap == 0:\n",
    "                    denovo.append(row)\n",
    "    return(pd.DataFrame(denovo, index=range(len(denovo))))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's start witht the 500 window because the files are smaller, so they go faster:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dnvs_500 = {}\n",
    "for trio in trios.iterkeys():\n",
    "    dnvs_500[trio] = get_dnvs(trios[trio], data_dir, 500)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Although this approach might work, I just noticed that they took a different approach in the paper! Basically, for each q0-filtered CNV in the affect child, they specifically genotyped those regions in the parents, and called DNVs based on their criteria. Not sure if I'd do that for every kid, but that makes sense. I could also, as a second check, genotype those specific regions from the affected child in the unaffected child. \n",
    "\n",
    "OK, so according to this issue (https://github.com/abyzovlab/CNVnator/issues/86) in the CNVnator webpage, the author doesn't recommend using it for WES. But, this review paper (https://molecularcytogenetics.biomedcentral.com/articles/10.1186/s13039-017-0333-5) did use it to compare to XMHH and CONIFER. So, the main issue here is that CNVnator doesn't genotype specific regions if a window of 1000 bp is not calculated. It's hard coded in the software... weird. But when I try to calculate it for some of my subjects, it doesn't work. I get Abnormal interval errors. It's likely because it's WES, and not WGS. So, we have two issues:\n",
    "\n",
    "* use a window that best suits our dpth of coverage\n",
    "* get around the genotyping hard code that CNVnaotr has of 1000 bp\n",
    "\n",
    "To figure out DOC, I did:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "file - (len 99)\n",
      "    col 0: min = 20.1100, mean = 57.3626, max = 120.0700, stdev = 13.9842\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[+] Loading AFNI current-openmp ...\n",
      "AFNI/current-openmp last updated  2017-11-02\n",
      "\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "module load afni\n",
    "\n",
    "\n",
    "cd /data/NCR_SBRB/simplex/xhmm\n",
    "for f in `ls *.DOC.sample_summary`; do tail -1 $f; done | cut -f 3 | 1d_tool.py -show_mmms -infile -"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The CNVnator paper (http://genome.cshlp.org/content/21/6/974.full.pdf+html) recommends ~100-bp bins for 20–30X coverage, ~500-bp bins for 4–6X coverage, and ~30-bp bins for 100X coverage. In the review paper they used  50–60bp bins for their coverage of 45–70X. So, even though my mean is around 50, which should make 50bp a good window, I have a few subjects with 20X, so I think 100bps is a safe number. Maybe once all is good I can try 50 as well just for SaGs...\n",
    "\n",
    "But after gettting several errors, such as not being able to genotype even the window I did have (even after I created 1000bp windows in subjects that could do that!), I decided to shelf explorations with CNVnator so we can focus on other tools that might be more promising, such as XHMM, CNVkit, Conifer, ExomeDepth, ExomeCopy, and Excavator2tool."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "-----"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'child': 'CLIA_400216', 'father': 'CLIA_400176', 'mother': 'CLIA_400177'}"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trios['855_trio1']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CNV_type</th>\n",
       "      <th>coordinates</th>\n",
       "      <th>CNV_size</th>\n",
       "      <th>normalized_RD</th>\n",
       "      <th>e-val1</th>\n",
       "      <th>e-val2</th>\n",
       "      <th>e-val3</th>\n",
       "      <th>e-val4</th>\n",
       "      <th>q0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>duplication</td>\n",
       "      <td>chr1:10051-470520</td>\n",
       "      <td>460470</td>\n",
       "      <td>3843240.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.765100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>deletion</td>\n",
       "      <td>chr1:470521-521400</td>\n",
       "      <td>50880</td>\n",
       "      <td>530372.0</td>\n",
       "      <td>0.232258</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>29355.500000</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>duplication</td>\n",
       "      <td>chr1:521401-2725920</td>\n",
       "      <td>2204520</td>\n",
       "      <td>3395790.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.112426</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>deletion</td>\n",
       "      <td>chr1:2725921-2743410</td>\n",
       "      <td>17490</td>\n",
       "      <td>97601.6</td>\n",
       "      <td>0.077662</td>\n",
       "      <td>9.070830e-167</td>\n",
       "      <td>0.477643</td>\n",
       "      <td>6.692910e-147</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>duplication</td>\n",
       "      <td>chr1:2743411-2946210</td>\n",
       "      <td>202800</td>\n",
       "      <td>302104.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.014721</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      CNV_type           coordinates  CNV_size  normalized_RD    e-val1  \\\n",
       "0  duplication     chr1:10051-470520    460470      3843240.0  0.000000   \n",
       "1     deletion    chr1:470521-521400     50880       530372.0  0.232258   \n",
       "2  duplication   chr1:521401-2725920   2204520      3395790.0  0.000000   \n",
       "3     deletion  chr1:2725921-2743410     17490        97601.6  0.077662   \n",
       "4  duplication  chr1:2743411-2946210    202800       302104.0  0.000000   \n",
       "\n",
       "          e-val2        e-val3         e-val4        q0  \n",
       "0   0.000000e+00      0.000000   0.000000e+00  0.765100  \n",
       "1   0.000000e+00  29355.500000   0.000000e+00  1.000000  \n",
       "2   0.000000e+00      0.000000   0.000000e+00  0.112426  \n",
       "3  9.070830e-167      0.477643  6.692910e-147  0.000000  \n",
       "4   0.000000e+00      0.000000   0.000000e+00  0.014721  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "child.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(60000000,)\n"
     ]
    }
   ],
   "source": [
    "b = np.zeros(60000000)\n",
    "print b.shape"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:py2.7.10]",
   "language": "python",
   "name": "conda-env-py2.7.10-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  },
  "nav_menu": {},
  "toc": {
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 6,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
